* This script merges together the different study datsets into a single analytical file. Those datasets include:
* HR data on absenteeism and employment
* Health insurance claims
* Baseline and follow-up surveys
* Treatment group status
* Participation status (screening, HRA, wellness activities)
* HR data on baseline characteristics
* Marathon participation data

clear
set more off
program drop _all

* All required add-ons are stored in the /packages and /auxiliary folders
adopath ++ "$Wellness_WhatDoesWWDo/scripts/packages"
adopath ++ "$Wellness_WhatDoesWWDo/scripts/auxiliary"
mata: mata mlib index

**************************************************************************************
**** Standardized cleaning code for baseline and follow-up surveys
**************************************************************************************

program define clean_survey_data

	assert inlist(Gender,"M","F")
	gen male = Gender=="M"
	drop Gender

	* Keep only respondents who completed the whole survey. 
	keep if FinalDisposition_x10000=="Complete":FinalDisposition_x10000

	***
	* 1. Ever had a screening? (7 questions)
	***
	local HealthScreeningVars "HealthScreening_Cholesterol HealthScreening_BloodSugarF HealthScreening_BloodSugarM"
	local WomensScreeningVars "WomensScreening_PapTest WomensScreening_Mammogram"
	local Over50ScreeningVars "Over50Screening_Colonoscopy Over50Screening_Prostate"

	* 1="Yes", 0="No". Assume that missings are equal to 0, so that missing a screening question does not cause everscreen to be missing. But, if every var is missing, then set everscreen to missing. (Never happens.)
	gen everscreen = 0
	gen nonmissing = 0
	foreach v in  `HealthScreeningVars' `WomensScreeningVars' `Over50ScreeningVars'  {
		assert inlist(`v',1,0,.)
		replace nonmissing = 1 if !mi(`v')
		replace everscreen = 1 if `v'==1
	}
	assert nonmissing==1
	drop nonmissing
	label var everscreen "Had at least 1 previous health screening"

	***
	* 2. Physically active (no missings here)
	***
	assert inlist(Exercise_ActivityLevelComparison,1,2,3)
	gen active = Exercise_ActivityLevelComparison=="More active":Exercise_ActivityLevelComparison
	label var active "Physically active"

	***
	* 3. Trying to be more active
	***
	assert inlist(Exercise_DrRecIncreaseActivity  ,0,1,.)
	assert inlist(Exercise_TryingToIncreaseActivit,0,1,.)
	gen active_try = 0
	foreach v in Exercise_DrRecIncreaseActivity Exercise_TryingToIncreaseActivit {
		replace active_try = 1 if `v'==1
	}
	replace active_try = . if mi(Exercise_DrRecIncreaseActivity) & mi(Exercise_TryingToIncreaseActivit)
	label var active_try "Trying to be more active"

	***
	* 4. Cigarette smoking status. Everyone answered the ever smoking question, but one person did not answer the current smoking question (2016)
	***
	gen cursmk  = .
	gen formsmk = .

	assert inlist(Cigarettes_100Lifetime,0,1,.)
	gen eversmk = Cigarettes_100Lifetime

	replace cursmk  = 1 if eversmk==1 & (Cigarettes_CurrentFrequency=="Some days":Cigarettes_CurrentFrequency | Cigarettes_CurrentFrequency=="Every day":Cigarettes_CurrentFrequency)
	replace formsmk = 1 if eversmk==1 & Cigarettes_CurrentFrequency=="Not at all":Cigarettes_CurrentFrequency

	replace cursmk  = 0 if eversmk==0 | formsmk==1
	replace formsmk = 0 if eversmk==0 | cursmk==1

	assert cursmk ==0 if formsmk==1 & !mi(cursmk)
	assert formsmk==0 if cursmk ==1 & !mi(formsmk)
	assert mi(cursmk) & mi(formsmk) if eversmk==.

	label var cursmk "Current smoker"
	label var formsmk "Former smoker"
	

	***
	* 5. Other tobacco use. As with cigarettes, one person did not answer (2016).
	***
	gen othersmk = 0
	foreach v in OtherTobacco_OtherProducts OtherTobacco_ECigarettes {
		assert inlist(`v',1,2,3,.)
		replace othersmk = 1 if `v'!="Not at all":`v' & !mi(`v')
	}
	replace othersmk = . if mi(OtherTobacco_OtherProducts) & mi(OtherTobacco_ECigarettes)
	label var othersmk "Other (non-cigarette) tobacco use"

	***
	* 6. Drinking
	***

	* Drinker is anybody who drank at least 1 day "in the last 7 days"
	assert inrange(Alcohol_7DaysDaysConsumedAny,0,7) if !mi(Alcohol_7DaysDaysConsumedAny)
	gen     drink = 1 if inrange(Alcohol_7DaysDaysConsumedAny,1,7)
	replace drink = 0 if Alcohol_7DaysDaysConsumedAny==0
	label var drink "Drinker"

	assert inrange(Alcohol_7DaysNumConsumedAvg,1,6) if !mi(Alcohol_7DaysNumConsumedAvg)
	assert inlist(male,0,1)

	* Heavy drinker: >=4 drinks/day for females, >=5 drinks/day for males (from pre-analysis plan)
	gen     drinkhvy = 1 if drink==1 & inrange(Alcohol_7DaysNumConsumedAvg,4,6) & male==0
	replace drinkhvy = 1 if drink==1 & inrange(Alcohol_7DaysNumConsumedAvg,5,6) & male==1

	replace drinkhvy = 0 if drink==1 & inrange(Alcohol_7DaysNumConsumedAvg,1,3) & male==0
	replace drinkhvy = 0 if drink==1 & inrange(Alcohol_7DaysNumConsumedAvg,1,4) & male==1
	replace drinkhvy = 0 if drink==0
	label var drinkhvy "Heavy drinker"

	assert mi(drinkhvy) if mi(drink)

	***
	* 7. Has at least one chronic health condition. Equal to missing if missing every single one of these respones (i.e., did not select "None at all"
	***
	gen chronic = 0
	gen allmissing = 1
	foreach v in Diabetes Asthma HypertensionHighBloodP ChronicBackPain HighCholesterol HeartAttackHeartDiseas EmphysemaCOPD CongestiveHeartFailure WeakFailingKidneys CancerMalignancy DepressionAnxiety Arthritis SinusitisRhinitis Allergies OtherSelect {
		assert inlist(Diagnosed_`v',0,1,.)
		replace chronic=1 if Diagnosed_`v'==1
		replace allmissing = 0 if !mi(Diagnosed_`v')
	}
	replace chronic = . if allmissing==1
	drop allmissing
	label var chronic "Has at least 1 chronic condition"

	***
	* 8. Self-reported health (never missing): Poor, Fair, Good, Very Good, Excellent
	***
	assert inrange(GeneralHealth_4WkHealthRating,1,5)

	gen health1 = GeneralHealth_4WkHealthRating=="Very Good":GeneralHealth_4WkHealthRating | GeneralHealth_4WkHealthRating=="Excellent":GeneralHealth_4WkHealthRating
	gen health2 = GeneralHealth_4WkHealthRating!="Poor":GeneralHealth_4WkHealthRating

	label var health1 "Health is excellent or very good"
	label var health2 "Health is not poor"

	***
	* 9. Problems with physical activities or pain (never missing)
	***
	assert !mi(GeneralHealth_PhysicalHealthLimi) & !mi(GeneralHealth_DifficultyDailyWor) & !mi(GeneralHealth_4WkBodilyPain)

	gen problems=0

	replace problems = 1 if inlist(GeneralHealth_PhysicalHealthLimi,"Somewhat":GeneralHealth_PhysicalHealthLimi,"Quite a lot":GeneralHealth_PhysicalHealthLimi,"Could not do physical activities":GeneralHealth_PhysicalHealthLimi)
	replace problems = 1 if inlist(GeneralHealth_DifficultyDailyWor,"Some":GeneralHealth_DifficultyDailyWor,"Quite a lot":GeneralHealth_DifficultyDailyWor,"Could not do daily work":GeneralHealth_DifficultyDailyWor)
	replace problems = 1 if inlist(GeneralHealth_4WkBodilyPain,"Mild":GeneralHealth_4WkBodilyPain,"Moderate":GeneralHealth_4WkBodilyPain,"Severe":GeneralHealth_4WkBodilyPain,"Very severe":GeneralHealth_4WkBodilyPain)

	label var problems "Problems with physical activities or pain"

	***
	* 10. Energy (never missing)
	***
	assert inrange(GeneralHealth_4WkEnergyRating,1,5) if !mi(GeneralHealth_4WkEnergyRating)

	gen energy = GeneralHealth_4WkEnergyRating=="An extraordinary amount":GeneralHealth_4WkEnergyRating | GeneralHealth_4WkEnergyRating=="Quite a lot":GeneralHealth_4WkEnergyRating
	replace energy = . if mi(GeneralHealth_4WkEnergyRating)

	label var energy "Lots of energy"

	***
	* 11. Emotional health
	***
	assert inrange(EmotionalHealth_4WkRating,1,5) if !mi(EmotionalHealth_4WkRating)

	gen ehealth = inlist(EmotionalHealth_4WkRating,"Moderately":EmotionalHealth_4WkRating,"Quite a lot":EmotionalHealth_4WkRating,"Extremely":EmotionalHealth_4WkRating)
	replace ehealth = . if mi(EmotionalHealth_4WkRating)

	label var ehealth "Emotional health"

	***
	* 12. Overweight status (never missing)
	***
	assert inrange(HealthPerceptions_BodyWeight,1,5) if !mi(HealthPerceptions_BodyWeight)
	gen overweight = inlist(HealthPerceptions_BodyWeight,"Overweight":HealthPerceptions_BodyWeight,"Very overweight":HealthPerceptions_BodyWeight) if !mi(HealthPerceptions_BodyWeight)

	label var overweight "Overweight status"

	***
	* 13. Bad health status (never missing). Treat "I don't know"=9998 same as "low"=1 or "normal"=2
	***

	assert inlist(HealthPerceptions_BloodPressureL,1,2,3,4,9998,.)
	assert inlist(HealthPerceptions_CholesterolLev,1,2,3,4,9998,.)
	assert inlist(HealthPerceptions_BloodGlucoseLe,1,2,3,4,9998,.)

	gen badhealth = 0
	replace badhealth = 1 if inlist(HealthPerceptions_BloodPressureL,"High (pre-hypertensive)":HealthPerceptions_BloodPressureL,"Very high (hypertensive)":HealthPerceptions_BloodPressureL)
	replace badhealth = 1 if inlist(HealthPerceptions_CholesterolLev,"High":HealthPerceptions_CholesterolLev,"Very high":HealthPerceptions_CholesterolLev)
	replace badhealth = 1 if inlist(HealthPerceptions_BloodGlucoseLe,"High":HealthPerceptions_BloodGlucoseLe,"Very high":HealthPerceptions_BloodGlucoseLe)
	replace badhealth = . if mi(HealthPerceptions_BloodPressureL) & mi(HealthPerceptions_CholesterolLev) & mi(HealthPerceptions_BloodGlucoseLe)

	label var badhealth "High blood pressure, cholesterol, or glucose"

	***
	* 14. Sedentary job: less than 1 hour of standing or walking around
	***

	gen sedentary = inlist(Workplace_JobStandingWalkingAvgF,"None at all":Workplace_JobStandingWalkingAvgF,"Some, but less than 1 hour":Workplace_JobStandingWalkingAvgF)
	replace sedentary = . if mi(Workplace_JobStandingWalkingAvgF)

	label var sedentary "Sedentary job"

	***
	* Health cost and utilization measures
	***

	* 1. Current drug use
	gen druguse = Healthcare_NumPrescriptionsMeds>0 & !mi(Healthcare_NumPrescriptionsMeds)

	replace druguse = 1 if Healthcare_NumOTCMeds>0 & !mi(Healthcare_NumOTCMeds)

	replace druguse = . if mi(Healthcare_NumPrescriptionsMeds) & mi(Healthcare_NumOTCMeds)
	label var druguse "Taking 1+ prescription/OTC drugs"

	* 2. At least 1 Physician/ER utilization in past 6 months
	gen     physician = 0
	replace physician = 1 if Healthcare_6MoClinicEmergencyFre!="None":Healthcare_6MoClinicEmergencyFre
	replace physician = . if mi(Healthcare_6MoClinicEmergencyFre)

	label var physician "Physician or ER utilization"

	* 3. At least 1 overnight hospital visit (excluding births) in past 6 months
	gen     hospital = 0
	replace hospital = 1 if Healthcare_6MoHospitalOvernights!="None":Healthcare_6MoHospitalOvernights
	replace hospital = . if mi(Healthcare_6MoHospitalOvernights)

	label var hospital "Hospital utilization"

	***
	* Absenteeism measures
	***
	gen sickdays = Workplace_12MoNumDaysMissed>0 if !mi(Workplace_12MoNumDaysMissed)

	label var sickdays "Sick days in past 12 months (baseline survey)"

	***
	* Productivity measures
	***
	
	* Worked 50+ hours/week
	gen hrsworked50 = Workplace_AvgHoursWorked=="50 or more":Workplace_AvgHoursWorked if !mi(Workplace_AvgHoursWorked)
	label var hrsworked50 "Worked 50 or more hours per week"

	* Job satisfication
	gen jobsatisf1 = inlist(Workplace_JobSatisfaction,"Very satisfied":Workplace_JobSatisfaction) 										         if !mi(Workplace_JobSatisfaction)
	gen jobsatisf2 = inlist(Workplace_JobSatisfaction,"Very satisfied":Workplace_JobSatisfaction,"Somewhat satisfied":Workplace_JobSatisfaction) if !mi(Workplace_JobSatisfaction)
	label var jobsatisf1 "Very satisfied with job"
	label var jobsatisf2 "Very or somewhat satisfied with job"

	* Workplace perceptions
	gen mgmtsafety = inlist(Workplace_ManagementHealthSafety,"Very high priority":Workplace_ManagementHealthSafety, "Some priority":Workplace_ManagementHealthSafety) if !mi(Workplace_ManagementHealthSafety)
	label var mgmtsafety "Management places very high or some priority on health/safety"	

end


* These variables were only available in the 2017+ surveys
program define clean_survey_data_2017

	* 0 = no, 1 = yes, 2- I don't know what iThrive is
	gen iThrive_talk = iThrive_TalktoCoworkers
	recode iThrive_talk (2=0)
	label var iThrive_talk "Discussed iThrive with coworkers over past year"

	* Happy at work
	gen happywork = Workplace_Happier=="Yes":Workplace_Happier if !mi(Workplace_Happier)
	label var happywork "Happier at work than last year"

	* Presenteeism. Koopman et al (2002): "The sum of the six items then produces a total Presenteeism Score." Values range from 1-5, so max score is 30 indicating 100% presenteeism (good).
	gen presenteeism = 0
	qui foreach v in Workplace_AbleToFinishTasks Workplace_FocusOnGoals Workplace_Energetic Workplace_Stresses Workplace_DistractedfromPleasure Workplace_Hopeless {
		
		* Only a small number of missing responses
		assert inlist(`v',"Strongly disagree":`v',"Somewhat disagree":`v',"Somewhat agree":`v',"Strongly agree":`v',"Not applicable":`v') if !mi(`v')
		count if mi(`v')
		assert r(N)<=5

		* In Koopman et al study, "uncertain" is assigned a 3. So for the few missing observations, assign them a 3.
		* "Not applicable" individuals presumably do not have disability or poor health, and thus will be coded as high presenteeism
		replace presenteeism = presenteeism+1 if `v'=="Strongly agree":`v'
		replace presenteeism = presenteeism+2 if `v'=="Somewhat agree":`v'
		replace presenteeism = presenteeism+3 if mi(`v')
		replace presenteeism = presenteeism+4 if `v'=="Somewhat disagree":`v'
		replace presenteeism = presenteeism+5 if `v'=="Strongly disagree":`v'
		replace presenteeism = presenteeism+5 if `v'=="Not applicable":`v'
	}
	assert inrange(presenteeism,6,30)
	label var presenteeism "SPS-6 (Stanford Presenteeism Scale, range 6-30)"

	* Productivity. Original pre-analysis plan was "Very productive" or "somewhat productive". But, then over 90% of sample has same response. Per pre-analysis plan note on "limited variation", limit this to just "Very productive"
	* Note: promotion question was mislabeled as "G56" on the pre-analysis plan, but actualy it is G57.
	gen productive = inlist(Workplace_Productive,"Very productive":Workplace_Productive) if !mi(Workplace_Productive)
	gen promotion = Workplace_Promotion=="Yes":Workplace_Promotion                       if !mi(Workplace_Promotion)
	label var productive "Feel very productive at work"
	label var promotion "Received promotion or more responsibility"

	* Job search
	gen jobsearch1 = inlist(Workplace_LookingForNewJob,"Very likely":Workplace_LookingForNewJob) 											  if !mi(Workplace_LookingForNewJob)
	gen jobsearch2 = inlist(Workplace_LookingForNewJob,"Very likely":Workplace_LookingForNewJob,"Somewhat likely":Workplace_LookingForNewJob) if !mi(Workplace_LookingForNewJob)
	label var jobsearch1 "Very likely to search for new job"
	label var jobsearch2 "Very or somewhat likely to search for new job"
end



**************************************************************************************
**** Absenteeism and employment variables from HR administrative data
**************************************************************************************
tempfile employment hr_absenteeism_data

***
* Employment start and termination dates
***
use "$Wellness_WhatDoesWWDo/data/raw/Employment-2017-08-16.dta", clear

rename (EmploymentStatus TerminationDate TerminationReason TerminationReasonDesc FTE Salary PrimaryTitleID) (status2017 term_date2017 term_reason2017 term_reason_desc2017 fte2017 salary_0717 title_0717)

merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Employment-2019-02-01.dta", assert(match) nogenerate
rename (EmploymentStatus TerminationDate TerminationReason TerminationReasonDesc FTE Salary PrimaryTitleID) (status2018 term_date2018 term_reason2018 term_reason_desc2018 fte2018 salary_0119 title_0119)

* One small discrepancy in termination dates -- defer to the newer file
assert term_date2017==mdy(6,14,2017) if AnalysisID==111759
assert term_date2018==mdy(2,6,2017)  if AnalysisID==111759
replace term_date2017=term_date2018  if AnalysisID==111759

assert term_date2017<=term_date2018 if !mi(term_date2017)
assert term_date2018<=mdy(2,1,2019) if !mi(term_date2018)

* Consistent with baseline HR data, set $0 salaries equal to missing
replace salary_0717=. if salary_0717<1
replace salary_0119=. if salary_0119<1

* Merge on hire date, baseline salary, and baseline title information
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HR_Baseline.dta", assert(match using) nogenerate keepusing(CurrentHireDate JobSalary)
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Employment-2016-06-16.dta", assert(match using) nogenerate keepusing(PrimaryTitleID)
ren PrimaryTitleID title_0616

keep AnalysisID term_date2018 salary* title* CurrentHireDate JobSalary
ren term_date2018 TerminationDate
save "`employment'", replace

***
* Civil service leave data. These are in panel form
***
use "$Wellness_WhatDoesWWDo/data/raw/CS-leave-2019-02-08.dta", clear

* SICK is cumulative (rolls over), and SICC is compensable (i.e., can be cashed out). Add them together to get total sick leave.
gen sickleave_cs = CS_Taken_SICK + CS_Taken_SICC
assert !mi(sickleave_cs)

* Calculate sick leave for the following time periods: (1) August 2015 - July 2016; (2) August 2016 - July 2017; (3) August 2016 - January 2019
* Note: sick leaves are recorded by pay period, so "ActivityMonth" does not ncesssarily span exactly one month.
bys AnalysisID: egen sickleave_cs_0815_0716 = sum(sickleave_cs) if inrange(ActivityMonth,ym(2015,08),ym(2016,07))
bys AnalysisID: egen sickleave_cs_0816_0717 = sum(sickleave_cs) if inrange(ActivityMonth,ym(2016,08),ym(2017,07))
bys AnalysisID: egen sickleave_cs_0816_0119 = sum(sickleave_cs) if inrange(ActivityMonth,ym(2016,08),ym(2019,01))

collapse (mean) sickleave_cs_*, by(AnalysisID) fast

* Two cases of negative total sick leave. Is likely due to a manual inter-temporal adjustment. Set equal to 0
count if sickleave_cs_0815_0716<0
assert r(N)==1
replace sickleave_cs_0815_0716=0 if sickleave_cs_0815_0716<0

count if sickleave_cs_0816_0717<0
assert r(N)==1
replace sickleave_cs_0816_0717=0 if sickleave_cs_0816_0717<0

count if sickleave_cs_0816_0119<0
assert r(N)==0

***
* Academic sick leave data (for AP and Faculty). These are simple totals for various time periods (i.e., not a panel)
***

merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/ACAD-leave-2018-08-15.dta", assert(match) nogenerate

* Academic leave is recorded for the following time periods (1) August 16, 2015 - August 15, 2016; (2) August 16, 2016 - August 15, 2017; and  (3) August 16, 2016 - August 15, 2018
* Total sick leave is sum of SICK (cumulative, i.e., rolls over past academic year cycle of August-August), SICN (non-cumulative, does not roll over), and SICC (compensable, very rare) 
gen sickleave_acad_0815_0816 =                            ACAD_Taken_SICK_0815_0816 + ACAD_Taken_SICN_0815_0816 + ACAD_Taken_SICC_0815_0816
gen sickleave_acad_0816_0817 = 						      ACAD_Taken_SICK_0816_0817 + ACAD_Taken_SICN_0816_0817 + ACAD_Taken_SICC_0816_0817
gen sickleave_acad_0816_0818 = sickleave_acad_0816_0817 + ACAD_Taken_SICK_0817_0818 + ACAD_Taken_SICN_0817_0818 + ACAD_Taken_SICC_0817_0818

assert sickleave_acad_0815_0816>=0
assert sickleave_acad_0816_0817>=0
assert sickleave_acad_0816_0818>=0


* 41 out of 12,459 people have records in both the CS and academic systems the one-year pre- and post-periods
count if (sickleave_acad_0815_0816>0 & sickleave_cs_0815_0716 >0) | (sickleave_acad_0816_0817 >0 & sickleave_cs_0816_0717 >0)
assert r(N)==41


* Final variable sums total sick leave for academic and civil service.
* Note that the start dates for academic leave do not line up exactly (off by about 15 days)
* Academic data are only recorded in May and August of each year --> 0816_0119 variable is thus incomplete for them
gen sickleave_0815_0716 = sickleave_cs_0815_0716 + sickleave_acad_0815_0816
gen sickleave_0816_0717 = sickleave_cs_0816_0717 + sickleave_acad_0816_0817
gen sickleave_0816_0119 = sickleave_cs_0816_0119 + sickleave_acad_0816_0818
label var sickleave_0816_0119 "Note: 0 for academics between 8/18 and 01/19"

assert sickleave_0815_0716>=0
assert sickleave_0816_0717>=0
assert sickleave_0816_0119>=0

* There are occasional (rare) adjustments to sick leave, which show up as offsetting negative numbers, perhaps to undo a previous entry mistake? This makes it possible to have total sick leave go down over short periods of time.
count if sickleave_0816_0717 > sickleave_0816_0119
assert r(N)==2
assert sickleave_0816_0717 <= sickleave_0816_0119+4

***
* Merge on employment dates and salary information
***
merge 1:1 AnalysisID using "`employment'", assert(match) nogenerate

* One employee has a termination date of 9/14/2015 ("resigned"), but took 60 days of sickleave in the 2016-2017 first year.
* Note: this person is in treatment group and signed up for activities. Also took follow-up survey.
* This person did NOT have a termination date in the one-year (2016-2017) job status file.
* Overall, this seems like a typo. They probably resigned in 2017, not 2015.
assert TerminationDate==mdy(9,14,2015)                  if AnalysisID==102424
assert sickleave_0816_0717>0 & !mi(sickleave_0816_0717) if AnalysisID==102424
replace TerminationDate=mdy(9,14,2017)                  if AnalysisID==102424

* Some people were hired after the baseline survey ended. Confirm they have no sick leave in the pre-period
assert sickleave_0815_0716==0 if CurrentHireDate>mdy(8,1,2016)

* Some people were terminated before baseline survey closed. Confirm they did not take sick leave during post-period (allowing for two-week buffer)
assert sickleave_0816_0717==0 if TerminationDate<mdy(7,16,2016)
assert sickleave_0816_0119==0 if TerminationDate<mdy(7,16,2016)

* Pre-period: Calculate how many days people are eligible for sick leave (the denominator)
* 12 employees have hire dates after 7/31/2016 -- set their days employed = 0 in pre-period
assert !mi(CurrentHireDate)
gen sickdays_eligible_0815_0716 = mdy(7,31,2016) - max(CurrentHireDate,mdy(8,1,2015))
count if sickdays_eligible_0815_0716<=0
assert r(N)==12
replace sickdays_eligible_0815_0716 = 0 if sickdays_eligible_0815_0716<0

* Post period calculations cover more than 365 days/yr (because we want to capture the 8/15 job separations). But the sick leave data only cover 365 days at most, so topcode this measure.
gen sickdays_eligible_0816_0717 = min(mdy(8,15,2017),TerminationDate) - mdy(8,1,2016)
assert !mi(sickdays_eligible_0816_0717)
replace sickdays_eligible_0816_0717 = 365 if sickdays_eligible_0816_0717>365

gen sickdays_eligible_0816_0119 = min(mdy(1,31,2019),TerminationDate) - mdy(8,1,2016)
assert !mi(sickdays_eligible_0816_0119)
replace sickdays_eligible_0816_0119 = 365*2+169 if sickdays_eligible_0816_0119>365*2+169

* 388 employees were terminated before 8/1/2016 -- set their days employed = 0 in post-period
foreach t in 0816_0717 0816_0119 {
	count if sickdays_eligible_`t'<0
	assert r(N)==388
	replace sickdays_eligible_`t' = 0 if sickdays_eligible_`t'<0
}


* QC people with 0 days employed in pre- or post-period: How many are study participants?
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/TreatmentAssignment-2016.dta", assert(match) nogenerate

* 5 people in the study have hire dates between 8/1/2016 and 8/16/2016
count if Randomized==1 & sickdays_eligible_0815_0716==0
assert r(N)==5
assert inrange(CurrentHireDate,mdy(8,1,2016),mdy(8,16,2016)) if Randomized==1 & sickdays_eligible_0815_0716==0

* 52 people in the study have termination dates between 2/29/16 and 7/31/16
count if Randomized==1 & sickdays_eligible_0816_0717==0
assert r(N)==52
assert inrange(TerminationDate,mdy(2,29,2016),mdy(7,31,2016)) if Randomized==1 & sickdays_eligible_0816_0717==0

* Length of 1-yr post period = 365 days per year. Length of pre-period = 365 days
assert inrange(sickdays_eligible_0816_0119,0,365*2+169)
assert inrange(sickdays_eligible_0816_0717,0,365)
assert inrange(sickdays_eligible_0815_0716,0,365)
gen employed_0717 = !(TerminationDate<=mdy(8,15,2017))
gen employed_0119 = !(TerminationDate<=mdy(1,31,2019))

assert employed_0717==1 if employed_0119==1

* Baseline 2016 data contained 238 missing salaries among "active" employees. This has slowly increased over time
count if mi(JobSalary)
assert r(N)==238
count if mi(salary_0717) & employed_0717
assert r(N)==294
count if mi(salary_0119) & employed_0119
assert r(N)==489

* 4 CS employees have sickleave in the post-period despite not being employed then. Set their sickleave values equal to 0
foreach t in 0816_0717 0816_0119 {
	count if sickleave_`t'>0 & sickdays_eligible_`t'==0
	assert r(N)==4
	replace sickleave_`t'=0 if sickdays_eligible_`t'==0	
}

* Normalize sick leave from total sick hours --> sick days per (fraction of employed) year, where day is defined as 8 hours.
foreach t in 0815_0716 0816_0717 0816_0119 {
	assert sickleave_`t'==0 if sickdays_eligible_`t'==0
	replace sickleave_`t' = (sickleave_`t'/8) / (sickdays_eligible_`t'/365) if sickdays_eligible_`t'!=0
}

* Outliers should be rare, and only occur for people with short eligiblities of 3 months or less. (eg, taking lots of sick days in the final month of work.)
foreach t in 0815_0716 0816_0717 0816_0119 {
	count if sickleave_`t' > 365
	assert r(N)<2
	assert sickdays_eligible_`t' <31 if sickleave_`t' > 365
}

* Create terminated variable
gen terminated_0717 = 1 - employed_0717
gen terminated_0119 = 1 - employed_0119

label var sickleave_0815_0716 "Sick leave (days/year), 8/15 - 7/16"
label var sickleave_0816_0717 "Sick leave (days/year), 8/16 - 7/17"
label var sickleave_0816_0119 "Sick leave (days/year), 8/16 - 1/19, assumes 0 for non-CS for 8/18-1/19."
label var sickdays_eligible_0815_0716 "Days eligible for sick leave (denominator), 8/15 - 7/16"
label var sickdays_eligible_0816_0717 "Days eligible for sick leave (denominator), 8/16 - 7/17"
label var sickdays_eligible_0816_0119 "Days eligible for sick leave (denominator), 8/16 - 7/18"
label var employed_0717 "Active employment as of 8/15/17"
label var employed_0119 "Active employment as of 1/31/19"
label var terminated_0717 "Not employed as of 8/15/17 = 1-employed_0717"
label var terminated_0119 "Not employed as of 1/31/19 = 1-employed_0119"

assert !mi(sickleave_0815_0716) & !mi(sickleave_0816_0717) & !mi(sickleave_0816_0119) & !mi(sickdays_eligible_0815_0716) & !mi(sickdays_eligible_0816_0717) & !mi(sickdays_eligible_0816_0119)

keep AnalysisID sickleave_0* sickdays_eligible* employed_* salary_* title_* terminated*

compress
save "`hr_absenteeism_data'", replace

**************************************************************************************
**** Healthcare claims data
**************************************************************************************
tempfile claims_data
use "$Wellness_WhatDoesWWDo/data/raw/Claims-2019-03-31", clear

* Use "place of service code" to categorize spending and utilization into: prescription drugs, office care, hospital care, and other
* Hospital place of service defined as: Off Campus-Outpatient Hospital + Inpatient Hospital + On Campus-Outpatient Hospital + ER - Hospital

* Spending measures
egen spend    = rowtotal(HA_Allowed_POS_*), missing
gen spendRx   = HA_Allowed_POS_1
gen spendOff  = HA_Allowed_POS_11
gen spendHosp = HA_Allowed_POS_19 + HA_Allowed_POS_21 + HA_Allowed_POS_22 + HA_Allowed_POS_23
gen spendOthr = spend - (spendRx + spendOff + spendHosp)

* Visit measures
gen visitRx   = HA_ClaimDays_POSGRP_Rx
gen visitOff  = HA_ClaimDays_POSGRP_Office
gen visitHosp = HA_ClaimDays_POSGRP_Hospital
gen visitOthr = HA_ClaimDays_POSGRP_Other

* Confirm that aggregated spending measures, calculated in the original data processing script, are the same
assert abs(spendRx  -HA_Allowed_POSGRP_Rx)      < .01
assert abs(spendOff -HA_Allowed_POSGRP_Office)  < .01
assert abs(spendHosp-HA_Allowed_POSGRP_Hospital)< .1
assert abs(spendOthr-HA_Allowed_POSGRP_Other)   < .1

* 31 visits is maximum possible number for a single month
foreach v in Rx Off Hosp Othr {
	assert inrange(visit`v',0,31)
}

* Coverage indicator is equal to 1 if employee had at least one day of coverage in that month
assert inrange(month_cov_frac,0,1)
gen covg = month_cov_frac>0.001
compress

* Restriction: Set spending and visits equal to missing if no coverage in that month. (Note: there are a few months with positive spending despite no coverage -- unclear why this is. About 130 cases total across all categories using up through 01/19; less for earlier periods)
foreach v of varlist spend* visit* {
	count if `v'>0 & covg==0
	assert r(N)<130
	replace `v' = . if covg==0
}

* Calculate a winsorized (topcoded) spending measure
merge m:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/TreatmentAssignment-2016.dta", keepusing(StudyArm) assert(match) nogenerate
gen     treat = StudyArm!="Control":StudyArm
replace treat = . if StudyArm=="Not Randomized":StudyArm
label var treat "0=control, 1=treat"
qui foreach v in 005 01 025 05 10 15 20 25 {
	gen spendw`v' = spend
	local ptile = 100 - .`v'*100
	_pctile spendw`v' if !mi(treat) & inrange(MonthOfService,ym(2015,07),ym(2017,07)), p(`ptile')
	replace spendw`v' = r(r1) if spendw`v'>r(r1) & !mi(spendw`v')
	noi di "Percentile: `ptile'"	
}
drop treat StudyArm

* Calculate measures of total spending at the 1, 2, 3, 6, 12, 24, and 30-month marks -- will later be converted to "any spending"
bys AnalysisID: egen anyspend_0816_0816 = total(spend) if inrange(MonthOfService,ym(2016,08),ym(2016,08)), missing
by  AnalysisID: egen anyspend_0816_0916 = total(spend) if inrange(MonthOfService,ym(2016,08),ym(2016,09)), missing
by  AnalysisID: egen anyspend_0816_1016 = total(spend) if inrange(MonthOfService,ym(2016,08),ym(2016,10)), missing
by  AnalysisID: egen anyspend_0816_0117 = total(spend) if inrange(MonthOfService,ym(2016,08),ym(2017,01)), missing
by  AnalysisID: egen anyspend_0816_0717 = total(spend) if inrange(MonthOfService,ym(2016,08),ym(2017,07)), missing
by  AnalysisID: egen anyspend_0816_0119 = total(spend) if inrange(MonthOfService,ym(2016,08),ym(2019,01)), missing

* Calculate average spending, and average visits, for pre- and post- periods: 
*    (1) July 2015 - July 2016 (pre-period) 
*    (2) August 2016 - July 2017 (one-year post-period)
*    (3) August 2016 - Jan 2019 (30-month post-period)

foreach v of varlist spend* visit* covg {
	bys AnalysisID: egen `v'_0715_0716 = mean(`v') if inrange(MonthOfService,ym(2015,07),ym(2016,07))
	by  AnalysisID: egen `v'_0816_0717 = mean(`v') if inrange(MonthOfService,ym(2016,08),ym(2017,07))
	by  AnalysisID: egen `v'_0816_0119 = mean(`v') if inrange(MonthOfService,ym(2016,08),ym(2019,01))
	drop `v'
}

collapse (mean) spend* visit* covg* anyspend*, by(AnalysisID) fast
compress
assert inrange(covg_0715_0716,0,1)
assert inrange(covg_0816_0717,0,1)
assert inrange(covg_0816_0119,0,1)

* Define nonzero spending variables
gen nonzero_spend_0715_0716 = spend_0715_0716>0 if !mi(spend_0715_0716)
gen nonzero_spend_0816_0717 = spend_0816_0717>0 if !mi(spend_0816_0717)
gen nonzero_spend_0816_0119 = spend_0816_0119>0 if !mi(spend_0816_0119)

* Define anyspending variables -- note that this is identical to the corresponding "nonzero" spending variable
foreach v of varlist anyspend* {
	replace `v' = `v'>0 if !mi(`v')
}
assert anyspend_0816_0717==nonzero_spend_0816_0717

label var spend_0715_0716         "Avg monthly spending, 7/15 - 7/16"
label var spend_0816_0717         "Avg monthly spending, 8/16 - 7/17"
label var spend_0816_0119         "Avg monthly spending, 8/16 - 1/19"

label var spendRx_0715_0716       "Avg monthly spending (Rx), 7/15 - 7/16"
label var spendRx_0816_0717       "Avg monthly spending (Rx), 8/16 - 7/17"
label var spendRx_0816_0119       "Avg monthly spending (Rx), 8/16 - 1/19"

label var spendHosp_0715_0716     "Avg monthly spending (hospital care), 7/15 - 7/16"
label var spendHosp_0816_0717     "Avg monthly spending (hospital care), 8/16 - 7/17"
label var spendHosp_0816_0119     "Avg monthly spending (hospital care), 8/16 - 1/19"

label var spendOff_0715_0716      "Avg monthly spending (office care), 7/15 - 7/16"
label var spendOff_0816_0717      "Avg monthly spending (office care), 8/16 - 7/17"
label var spendOff_0816_0119      "Avg monthly spending (office care), 8/16 - 1/19"

label var spendOthr_0715_0716     "Avg monthly spending (other), 7/15 - 7/16"
label var spendOthr_0816_0717     "Avg monthly spending (other), 8/16 - 7/17"
label var spendOthr_0816_0119     "Avg monthly spending (other), 8/16 - 1/19"

label var visitRx_0715_0716       "Avg monthly visits (Rx), 7/15 - 7/16"
label var visitRx_0816_0717       "Avg monthly visits (Rx), 8/16 - 7/17"
label var visitRx_0816_0119       "Avg monthly visits (Rx), 8/16 - 1/19"

label var visitHosp_0715_0716     "Avg monthly visits (hospital care), 7/15 - 7/16"
label var visitHosp_0816_0717     "Avg monthly visits (hospital care), 8/16 - 7/17"
label var visitHosp_0816_0119     "Avg monthly visits (hospital care), 8/16 - 1/19"

label var visitOff_0715_0716      "Avg monthly visits (office care), 7/15 - 7/16"
label var visitOff_0816_0717      "Avg monthly visits (office care), 8/16 - 7/17"
label var visitOff_0816_0119      "Avg monthly visits (office care), 8/16 - 1/19"

label var visitOthr_0715_0716     "Avg monthly visits (other), 7/15 - 7/16"
label var visitOthr_0816_0717     "Avg monthly visits (other), 8/16 - 7/17"
label var visitOthr_0816_0119     "Avg monthly visits (other), 8/16 - 1/19"

label var nonzero_spend_0715_0716 "Non-zero spending, 7/15 - 7/16"
label var nonzero_spend_0816_0717 "Non-zero spending, 8/16 - 7/17"
label var nonzero_spend_0816_0119 "Non-zero spending, 8/16 - 1/19"

label var covg_0715_0716          "Fraction months w/ health coverage, 7/15 - 7/16"
label var covg_0816_0717          "Fraction months w/ health coverage, 8/16 - 7/17"
label var covg_0816_0119          "Fraction months w/ health coverage, 8/16 - 1/19"

label var anyspend_0816_0816      "Any spending, 8/16 - 8/16"
label var anyspend_0816_0916      "Any spending, 8/16 - 9/16"
label var anyspend_0816_1016      "Any spending, 8/16 - 10/16"
label var anyspend_0816_0117      "Any spending, 8/16 - 1/17"
label var anyspend_0816_0717      "Any spending, 8/16 - 7/17"
label var anyspend_0816_0119      "Any spending, 8/16 - 1/19"

foreach v in Rx Hosp Off Othr {
	assert spend`v'_0715_0716 <= spend_0715_0716
	assert spend`v'_0816_0717 <= spend_0816_0717
	assert spend`v'_0816_0119 <= spend_0816_0119
	
	assert mi(spend`v'_0715_0716) if mi(spend_0715_0716)
	assert mi(spend`v'_0816_0717) if mi(spend_0816_0717)
	assert mi(spend`v'_0816_0119) if mi(spend_0816_0119)
	
	assert inrange(visit`v'_0715_0716,0,31) if !mi(visit`v'_0715_0716)
	assert inrange(visit`v'_0816_0717,0,31) if !mi(visit`v'_0816_0717)
	assert inrange(visit`v'_0816_0119,0,31) if !mi(visit`v'_0816_0119)
}

foreach d in 0715_0716 0816_0717 0816_0119 {
	assert abs(spend_`d' - (spendRx_`d'+spendHosp_`d'+spendOff_`d'+spendOthr_`d'))<0.01 if !mi(spend_`d')
}

compress
save "`claims_data'", replace

**************************************************************************************
**** Campus Rec data
**************************************************************************************
tempfile campus_rec_data
use "$Wellness_WhatDoesWWDo/data/raw/CampusRec-2019-01-31.dta", clear

* Calculate number of gym days for the following periods: (1) August 2015 - July 2016; (2) August 2016 - July 2017; and (3) August 2016 - January 2019 (30 months);
* Note: we do not normalize by employment duration  because it looks like some individuals (e.g., retirees) continue using the gym for a while.
bys AnalysisID: egen gym_0815_0716 = sum(CRC_SwipeDays) if inrange(SwipeMonth,ym(2015,08),ym(2016,07))
bys AnalysisID: egen gym_0816_0717 = sum(CRC_SwipeDays) if inrange(SwipeMonth,ym(2016,08),ym(2017,07))
bys AnalysisID: egen gym_0816_0119 = sum(CRC_SwipeDays) if inrange(SwipeMonth,ym(2016,08),ym(2019,01))

collapse (mean) gym_*, by(AnalysisID) fast

* Normalize post-period years that are longer than 1 year
replace gym_0816_0119 = gym_0816_0119/2.5

foreach t in 0815_0716 0816_0717 0816_0119 {
	summ gym_`t'
	assert inrange(gym_`t',0,365)
}

label var gym_0815_0716 "Campus rec visits (days/year), 8/15 - 7/16"
label var gym_0816_0717 "Campus rec visits (days/year), 8/16 - 7/17"
label var gym_0816_0119 "Campus rec visits (days/year), 8/16 - 1/19"

compress
save "`campus_rec_data'", replace


**************************************************************************************
**** Strata variables and treatment assignment data
**************************************************************************************
tempfile treatment_assignment_data

use "$Wellness_WhatDoesWWDo/data/raw/TreatmentAssignment-2016.dta", clear
keep AnalysisID StudyArm StudyArm_ScreenHRA StudyArm_Activities Strata s?_*

gen     treat = StudyArm!="Control":StudyArm
replace treat = . if StudyArm=="Not Randomized":StudyArm
label var treat "0=control, 1=treat"

foreach v of varlist s?_* {
	assert !mi(`v')
}
gen faculty = s1_eclass=="Faculty"
gen AP      = s1_eclass=="AP"
gen CS      = s1_eclass=="CS"
label var faculty "Faculty (strata var)"
label var AP      "Academic Professional (strata var)"
label var CS      "Civil Service (strata var)"

gen age50    = s3_age=="age50+"
gen age37_49 = s3_age=="age37-49"
gen age36    = s3_age=="age36-"
label var age50    "Ages 50 and over (strata var)"
label var age37_49 "Ages 37-49 (strata var)"
label var age36    "Ages 36 and under (strata var)"

gen salaryM2 = s4_salary2=="Rich"
gen salaryM1 = s4_salary2=="Poor"
label var salaryM2 "Salary above study participant median (strata var)"
label var salaryM1 "Salary below study participant median (strata var)"

gen salaryQ4 = s5_salary4=="Salary Q4"
gen salaryQ3 = s5_salary4=="Salary Q3"
gen salaryQ2 = s5_salary4=="Salary Q2"
gen salaryQ1 = s5_salary4=="Salary Q1"
label var salaryQ4 "Top quartile of study participant salary (strata var)"
label var salaryQ3 "Third quartile of study participant salary (strata var)"
label var salaryQ2 "Second quartile of study participant salary (strata var)"
label var salaryQ1 "Bottom quartile of study participant salary (strata var)"

gen white = s6_race=="White"
label var white "HR race is white (strata var)"
drop s?_*

compress
save "`treatment_assignment_data'", replace

**************************************************************************************
****** 2018 online survey
**************************************************************************************

tempfile survey_2018

* Initial dataset N=4,834
use "$Wellness_WhatDoesWWDo/data/raw/FollowupSurvey-2018.dta", clear

merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HR_Baseline.dta", assert(match using) nogenerate keepusing(Gender)

***
* Clean variables common across 2016/2017/2018 surveys (2016 is strict subset of 2017; 2017 and 2018 vars are the same)
***
clean_survey_data
clean_survey_data_2017

* Define vars of interest
local svy_hvars "everscreen active active_try cursmk othersmk formsmk drink drinkhvy chronic health1 health2 problems energy ehealth overweight badhealth sedentary"
local svy_uvars "druguse physician hospital"
local productivity_vars2016 "sickdays hrsworked50 jobsatisf1 jobsatisf2	mgmtsafety"
local productivity_vars2017 "happywork presenteeism productive promotion jobsearch1 jobsearch2"

* Keep and rename vars of interest
keep AnalysisID `svy_hvars' `svy_uvars' `productivity_vars2016' `productivity_vars2017' iThrive_talk

ren * *_0718
ren AnalysisID_0718 AnalysisID

gen survey2018_c = 1
label var survey2018_c "Completed 2018 follow-up survey"

compress
save "`survey_2018'", replace

**************************************************************************************
****** 2017 online survey
**************************************************************************************

tempfile survey_2017

* Initial dataset N=4,834
use "$Wellness_WhatDoesWWDo/data/raw/FollowupSurvey-2017.dta", clear

merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HR_Baseline.dta", assert(match using) nogenerate keepusing(Gender)

* One individual skipped almost every single question, so drop him (i.e., treat as incomplete survey)
foreach v of varlist Exercise_DrRecIncreaseActivity-Background_RaceEth_OtherSpecifie {
	assert mi(`v') if AnalysisID==110567
}
drop if AnalysisID==110567

***
* Clean variables common across 2016/2017/2018 surveys (2016 is strict subset of 2017; 2017 and 2018 vars are the same)
***
clean_survey_data
clean_survey_data_2017

* Keep and rename vars of interest
keep AnalysisID `svy_hvars' `svy_uvars' `productivity_vars2016' `productivity_vars2017' iThrive_talk

ren * *_0717
ren AnalysisID_0717 AnalysisID

gen survey2017_c = 1
label var survey2017_c "Completed 2017 follow-up survey"

compress
save "`survey_2017'", replace

**************************************************************************************
****** 2016 baseline survey
**************************************************************************************

* Initial dataset N=12,459
use "$Wellness_WhatDoesWWDo/data/raw/BaselineSurvey-2016.dta", clear

* Merge on HR data to get Gender variable (needed for drinking question)
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HR_Baseline.dta", assert(match) nogenerate keepusing(Gender)

* Clean variables common across 2016 and 2017 surveys (2016 is strict subset of 2017)
clean_survey_data

keep AnalysisID male `svy_hvars' `svy_uvars' `productivity_vars2016'

**************************************************************************************
*** Merge together all the datasets
**************************************************************************************

* Merge on wellness activities completion data
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Activities-2018.dta", assert(match master) nogenerate keepusing(RegisteredActivityFall* CompletedActivityFall* RegisteredActivitySpring* CompletedActivitySpring*)

* Merge on HRA completion data
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HRA-2016.dta", assert(match master) nogenerate keepusing(CompletedHRA)
ren CompletedHRA hra_c
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HRA-2017.dta", assert(match master) nogenerate keepusing(CompletedHRA)
ren CompletedHRA hra_c_yr2

* Ever scheduled a screening?
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/AcuityScheduling-2016.dta", assert(match master) nogenerate keepusing(ScheduledScreening)

* Completed a 2016 screening (treatment group only)
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Biometrics_DigitizedByRAs-2016.dta", assert(match master) nogenerate keepusing(CompletedScreening)

assert _N==4834

ren RegisteredActivityFall16 activity_f_r
ren CompletedActivityFall16  activity_f_c
ren RegisteredActivitySpring17 activity_s_r
ren CompletedActivitySpring17 activity_s_c

ren RegisteredActivityFall17 activity_f_r_yr2
ren CompletedActivityFall17  activity_f_c_yr2
ren RegisteredActivitySpring18 activity_s_r_yr2
ren CompletedActivitySpring18 activity_s_c_yr2

ren ScheduledScreening screening_r
ren CompletedScreening screening_c

* Completed 2017/2018 screenings (control/treatment group)
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Biometrics_DigitizedByRAs-2017.dta", assert(match) nogenerate keepusing(CompletedScreening)
ren CompletedScreening screening2017_c
label var screening2017_c "Completed 2017 follow-up screening"

merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Biometrics_DigitizedByRAs-2018.dta", assert(match) nogenerate keepusing(CompletedScreening)
ren CompletedScreening screening2018_c
label var screening2018_c "Completed 2018 follow-up screening"

* Merge on random assignment data for all years, plus 2016 stratification variables, and increase sample to include all employees
merge 1:1 AnalysisID using "`treatment_assignment_data'", assert(match using) keep(match using) nogenerate
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/TreatmentAssignment-2017.dta", assert(match) nogenerate keepusing(StudyArm_2017)
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/TreatmentAssignment-2018.dta", assert(match) nogenerate keepusing(StudyArm_2018)
assert _N==12459


* Merge on HR baseline data for all employees
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/HR_Baseline.dta", assert(match) nogenerate keepusing(Gender BirthYear JobSalary)
ren JobSalary salary_0616

* Merge on health insurance claims data
merge 1:1 AnalysisID using "`claims_data'", assert(match) nogenerate

* Merge on HR absenteeism, employment, title, and updated salary data for all employees. Create additional productivity variables
merge 1:1 AnalysisID using "`hr_absenteeism_data'", assert(match) nogenerate

gen salaryRaise_0616_0717 = (salary_0717/salary_0616)-1
gen salaryRaise_0616_0119 = (salary_0119/salary_0616)-1
label var salaryRaise_0616_0717 "Salary raise between 6/16 and 7/17 (share of 6/16 salary)"
label var salaryRaise_0616_0119 "Salary raise between 6/16 and 1/19 (share of 6/16 salary)"

gen titleChange_0616_0717 = title_0616!=title_0717 if !mi(salaryRaise_0616_0717)
gen titleChange_0616_0119 = title_0616!=title_0119 if !mi(salaryRaise_0616_0119)
label var titleChange_0616_0717 "Title change between 6/16 and 7/17"
label var titleChange_0616_0119 "Title change between 6/16 and 1/19"

gen promotion_0616_0717 = titleChange_0616_0717*(salaryRaise_0616_0717 >= 0) if !mi(salaryRaise_0616_0717)
gen promotion_0616_0119 = titleChange_0616_0119*(salaryRaise_0616_0119 >= 0) if !mi(salaryRaise_0616_0119)
label var promotion_0616_0717 "Promotion between 6/16 and 7/17"
label var promotion_0616_0119 "Promotion between 6/16 and 1/19"
drop title_????

* Merge on campus rec (swipe card) data
merge 1:1 AnalysisID using "`campus_rec_data'", assert(match) nogenerate

* Merge on Marathon data
merge 1:1 AnalysisID using "$Wellness_WhatDoesWWDo/data/raw/Marathon-2018.dta", assert(match) keepusing(match2014_any match2015_any match2016_any match2017_any match2018_any) nogenerate
ren match2014_any marathon_2014
ren match2015_any marathon_2015
ren match2016_any marathon_2016
ren match2017_any marathon_2017
ren match2018_any marathon_2018
gen marathon_2014_2016 = marathon_2014 | marathon_2015 | marathon_2016
label var marathon_2014_2016 "IL Marathon participant anytime 2014-2016"
forval v = 2014/2018 {
	label var marathon_`v' "IL Marathon participant in `v'"
}

* Merge on 2017 follow-up survey
merge 1:1 AnalysisID using "`survey_2017'", assert(match master) nogenerate
replace survey2017_c = 0 if mi(survey2017_c)
order survey2017_c, after(screening_c)

* Merge on 2018 follow-up survey
merge 1:1 AnalysisID using "`survey_2018'", assert(match master) nogenerate
replace survey2018_c = 0 if mi(survey2018_c)
order survey2018_c, after(screening_c)

gen age = 2016 - BirthYear
gen hra_c_nomiss = hra_c
replace hra_c_nomiss = 0 if missing(hra_c_nomiss)
label var hra_c_nomiss "Completed 2016 Wellsource HRA (missing coded as 0)"
label var hra_c        "Completed 2016 Wellsource HRA"
label var hra_c_yr2    "Completed 2017 Wellsource HRA"

assert inlist(Gender,"M","F")
drop male
gen male = Gender=="M"
assert !mi(male)

**************************************************************************************
*** Create productivity indices, based on the first principle component
**************************************************************************************

* Note: terminated var is nearly perfectly collinear with titleChange/salaryRaise, since those latters vars are undefined for people who are no longer employed
local svy_prodvars "sickdays hrsworked50 jobsatisf1 jobsatisf2 mgmtsafety"
local admin_prodvars "sickleave_0815_0716 salary_0616"

local job_vars_survey_0717 "sickdays_0717 hrsworked50_0717 jobsatisf1_0717 jobsatisf2_0717 mgmtsafety_0717 happywork_0717 productive_0717 promotion_0717 jobsearch1_0717 jobsearch2_0717"
local job_vars_admin_0717  "salaryRaise_0616_0717 promotion_0616_0717 titleChange_0616_0717 terminated_0717 sickleave_0816_0717"

local job_vars_survey_0718 "sickdays_0718 hrsworked50_0718 jobsatisf1_0718 jobsatisf2_0718 mgmtsafety_0718 happywork_0718 productive_0718 promotion_0718 jobsearch1_0718 jobsearch2_0718"
local job_vars_admin_0119  "salaryRaise_0616_0119 promotion_0616_0119 titleChange_0616_0119 terminated_0119 sickleave_0816_0119"

local prod_vars_yr0 "`svy_prodvars' `admin_prodvars'"
local prod_vars_yr1 "`job_vars_survey_0717' `job_vars_admin_0717'"
local prod_vars_yr2 "`job_vars_survey_0717' `job_vars_admin_0119'"

cap mkdir "$Wellness_WhatDoesWWDo/results/intermediate_files/productivity"
tempfile t
qui foreach v in yr0 yr1 yr2 {

	pca `prod_vars_`v'', components(1)
	predict prod_index_`v'
	matrix loadings_`v' = e(L)

	preserve
		clear
		local rownames : rowfullnames loadings_`v'
		local c : word count `rownames'
		svmat loadings_`v', names("loadings_`v'_")
		gen var = ""
		forvalues i = 1/`c' {
			replace var = "`:word `i' of `rownames''" in `i'
		}
		save "$Wellness_WhatDoesWWDo/results/intermediate_files/productivity/prod_index_`v'_loadings.dta", replace	
	restore
}
label var prod_index_yr0 "Baseline productivity index, based on svy and admin vars"
label var prod_index_yr1 "Year 1 productivity index, based on 2017 svy and 07/17 admin vars"
label var prod_index_yr2 "Year 2 productivity index, based on 2018 svy and 01/19 admin vars"


**************************************************************************************
*** Create the main dataset for subsequent analysis
**************************************************************************************
compress
assert _N==12459
save "$Wellness_WhatDoesWWDo/data/proc/wellness_analysis.dta", replace

* Copy documentation of variable definitions to results folder
copy "$Wellness_WhatDoesWWDo/data/raw/documentation/appendix_varlist.tex" "$Wellness_WhatDoesWWDo/results/tables/appendix_varlist.tex", replace

** EOF
